This script is used to produce the analyses of the paper. Our data is preprocessed in the data_processing.Rmd file.
Last edit: 30/03/2024
##Setup
Fitting exponential and logistic PCR models to qPCR data (SybrGreen) of Capsella bursa-pastoris. Data and script are described in Supplementary File 1.
Converting RFU to number of molecules for illustrative purpose (scaling).
#Open file containing qPCR data
ampli <- read.csv("data/20210712_101200_CT030043_SPER01_ANALYSE - Quantification Amplification Results_SYBR.csv")[,-1]
ampli <- ampli %>% pivot_longer(2:ncol(ampli))
colnames(ampli)[2:3]<- c("Well","RFU")
ampli <- ampli[ampli$Well == "B7",] #B7: species Cbp, sample 26
K <- 1.32e11 #Charge capacity, computed below
#Convert RFU to molecules
a <- K/(max(ampli$RFU)-min(ampli$RFU))
b <- -a*min(ampli$RFU)
ampli$molecules <- a * ampli$RFU + b
data_ampli <- as.matrix(ampli[,c("Cycle", "RFU")])
data_ampli[,1] <- data_ampli[,1] - 1
ncycles <- nrow(ampli)
Parameters are fitted numerically. No biological consideration here. The function (pcrfit) and the data structure used come from R package qpcR.
Non-zero low weights are given for numerical reasons.
mexp <- cm3 #Reuse model object from package qPCR
#Define model
mexp$expr <- "Fluo ~ mexp$fct(Cycles, c(M0, Lam))"
mexp$fct <- function(x, parm) {
M0 <- parm[1]
Lam <- parm[2]
Fn <- vector(mode = "numeric", length = length(x))
for (i in 1:length(x)) {
if (i == 1) Fn[i] <- M0
else {
Fn[i] <- Fn[i - 1] * (1 + Lam*(1-Fn[i - 1]/K))
}
}
return(Fn)
}
mexp$ssFct <- function(x, y) {
## start estimates
M0 <- 1
Lam <- 1
ssVal <- c(M0, Lam)
names(ssVal) <- mexp$parnames
return(ssVal)
}
mexp$inv <- function(y, parm) {
x <- 1:100
fn <- function(x, parm) mexp$fct(x, parm) - y
uniroot(fn, interval = c(1, 100), parm)$root
}
mexp$expr.grad <- expression(mexp$fct(Cycles, c(M0, Lam)))
mexp$parnames <- c("M0", "Lam")
mexp$name <- "mexp"
mexp$type <- "my exponential model"
#Weight cycles
weights_exp <- c(rep(0, 5), rep(1, 9), rep(1000, 6), rep(0, 41))
fitexp <- pcrfit(data_ampli, 1, 2, mexp,
weights = weights_exp)
res_exp <- a*fitexp$m$predict(newdata = fitexp$DATA) + b
mlog <- cm3 #Reuse model object from package qPCR
#Define model
mlog$expr <- "Fluo ~ mlog$fct(Cycles, c(M0, Lam))"
mlog$fct <- function(x, parm) {
M0 <- parm[1]
Lam <- parm[2]
K <- max(ampli$RFU)
Fn <- vector(mode = "numeric", length = length(x))
for (i in 1:length(x)) {
if (i == 1) Fn[i] <- M0
else {
if (Fn[i - 1] < K){Fn[i] <- Fn[i - 1] * (1 + Lam*(1-Fn[i - 1]/K)) }
else {Fn[i] <- Fn[i - 1]}
}
}
return(Fn)
}
mlog$ssFct <- function(x, y) {
## start estimates
M0 <- 1
Lam <- 1
K <- max(ampli$RFU)
ssVal <- c(M0, Lam)
names(ssVal) <- mlog$parnames
return(ssVal)
}
mlog$inv <- function(y, parm) {
x <- 1:100
fn <- function(x, parm) mlog$fct(x, parm) - y
uniroot(fn, interval = c(1, 100), parm)$root
}
mlog$expr.grad <- expression(mlog$fct(Cycles, c(M0, Lam)))
mlog$parnames <- c("M0", "Lam")
mlog$name <- "mlog"
mlog$type <- "my logistic model"
#Weight cycles
weights_log <- c(rep(0, 5), rep(1, 9), rep(1000, 10), rep(1, 15), rep(0, 22))
fitlog <- pcrfit(data_ampli, 1, 2, mlog,
weights = weights_log)
res_log <- a*fitlog$m$predict(newdata = fitlog$DATA) + b
ggplot()+
geom_point(data = ampli, aes(Cycle-1, molecules))+
geom_line(data = NULL, aes(0:60, res_log,
col = "Logistic"),
linewidth = 0.75)+
geom_point(data = ampli, aes(Cycle-1, molecules))+
geom_line(data = NULL, aes(0:60, res_exp,
col = "Exponential"),
linewidth = 0.75)+
labs(x = "Cycle",
y = "Estimated number of molecules",
col = "Amplification model")+
theme(legend.justification = c(1, 0),
legend.position = c(1, 0),
legend.box.margin=margin(rep(10, 4)))+
scale_colour_brewer(palette = "Set1")+
# scale_y_log10()+
coord_cartesian(xlim = c(0, 50), ylim = c(1, K))
#+
# ggtitle(
# paste0("Real qPCR data and saturation models - Species : ",
# "Capsella bursa-pastoris"))+
# theme(plot.title = element_text(hjust = 0.5))
# ggsave(filename = "Figures/saturation_models2.pdf",
# dpi = 600, units = "mm",
# width = 150, height = 100, device = "pdf")
Dataframe summary_experiment contains all info concerning the concentration assays.
#QuantaSoft output file with additional data
dataddpcr <- read_csv("data/dataddpcr.csv")
## Rows: 85 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Well, Status, sp
## dbl (8): Concentration, Negatives, AcceptedDroplets, Sample, Dilu, plate, To...
## lgl (1): Used
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Used columns:
# Sample: Number of the plant sample
# Concentration: Assayed copies/ul in well
# sp: species
# TotalDNA: Total DNA concentration in ddPCR wells, ng/ul
#Open extraction data
#Used columns:
# species name
# Sample: Number of the plant sample
# CDNA_sample: Concentration of total DNA in the sample assayed by Qubit, ng/ul
extraction <- read.csv("data/extraction.csv")
Vdna_dd <- 5 #Volume of DNA in a well in ul
Vmix_dd <- 20 #Volume of mix in a well in ul
#summary_experiment contains all info concerning the concentration assays
#Molecules_ng is the number of target molecules / ng of total DNA
summary_experiment <- dataddpcr %>% group_by(Sample) %>%
summarise(sp = sp[1],
Molecules_ng = mean(Concentration/TotalDNA)) %>%
left_join(extraction) %>%
dplyr::select(Sample, species_name, sp, Molecules_ng, CDNA_sample)
## Joining with `by = join_by(Sample)`
DNA concentration in the different communities
CDNA_U <- 0.5
#Total DNA concentration in ng/ul for all plants
#in the U community (concentration in sample, not in pcr mix)
C_species_ref <- CDNA_U/13 #concentration of each species, ng/ul
summary_experiment <- summary_experiment %>%
mutate(Volume_equi = max(Molecules_ng)/Molecules_ng,
cDNA_U = C_species_ref*Volume_equi,
#Concentration of total DNA to use in the U commu
molecules_U = cDNA_U*Molecules_ng, #Molecules/ul in sample
propU = molecules_U/sum(molecules_U), #species proportion
Volume_equi = max(Molecules_ng)/Molecules_ng,
#Volume to take to be as concentrated as 1 ul of Ptr 6 (most concentrated)
RankG = 13 - rank(CDNA_sample/Volume_equi) + 1)
Number of molecules in the M_U community
VDNA_metab <- 2 #Volume of DNA used in the mix, ul
VDNA_metab * summary_experiment$molecules_U #Molecules of each species in mix
## [1] 19084.31 19084.31 19084.31 19084.31 19084.31 19084.31 19084.31 19084.31
## [9] 19084.31 19084.31 19084.31 19084.31 19084.31
VDNA_metab * sum(summary_experiment$molecules_U) #Total number of molecules
## [1] 248096
Number of molecules in the M_T community
summary_experiment <- summary_experiment %>%
mutate(molecules_T = C_species_ref*Molecules_ng, #molecules/ul of sample
propT = molecules_T/sum(molecules_T)) #species proportions
VDNA_metab * summary_experiment$molecules_T #Molecules of each species in mix
## [1] 11760.000 19084.308 4051.282 6695.385 5672.308 6896.410 10360.237
## [8] 9659.487 16280.000 2964.103 6330.256 2883.282 10806.154
VDNA_metab * sum(summary_experiment$molecules_T) #total number of molecules
## [1] 113443.2
Number of molecules in the M_G community (2-fold dilution)
summary_experiment <- summary_experiment %>%
mutate(Volume_equi = max(Molecules_ng)/Molecules_ng,
cDNA_G = CDNA_U/2^RankG*Volume_equi,
molecules_G = cDNA_G*Molecules_ng,
propG = molecules_G/sum(molecules_G))
summary_experiment$molecules_G * VDNA_metab #Molecules of each species in mix
## [1] 7753.00000 15506.00000 121.14063 1938.25000 3876.50000
## [6] 60.57031 969.12500 124048.00000 62024.00000 484.56250
## [11] 242.28125 30.28516 31012.00000
sum(summary_experiment$molecules_G) * VDNA_metab #Total number of molecules
## [1] 248065.7
Information about the barcodes used with Sper01 marker.
barcodes <- read_excel("data/new_plant_positive_control.xlsx")[,c(2,4:6)]
## New names:
## • `` -> `...1`
names(barcodes) <- c("species_name", "sequence", "length", "gc_content")
barcodes[barcodes$species_name ==
"Salvia_pratensis",]$sequence <- "atcctgttttctcaaaacaaaggttcaaaaaacgaaaaaaaaaaag"
barcodes <- barcodes %>% filter(species_name != "Rumex_acetosa")
#Species not used in this experiment
#Information about barcodes and extracted concentrations
barcodes %>% left_join(extraction[,-2])
## Joining with `by = join_by(species_name)`
## # A tibble: 30 × 5
## species_name sequence length gc_content CDNA_sample
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Taxus_baccata atccgtattataggaacaataatttta… 41 24.4 7.94
## 2 Taxus_baccata atccgtattataggaacaataatttta… 41 24.4 13.3
## 3 Salvia_pratensis atcctgttttctcaaaacaaaggttca… 45 26.7 20.8
## 4 Salvia_pratensis atcctgttttctcaaaacaaaggttca… 45 26.7 24.4
## 5 Populus_tremula atcctatttttcgaaaacaaacaaaaa… 68 25 33
## 6 Populus_tremula atcctatttttcgaaaacaaacaaaaa… 68 25 31.4
## 7 Carpinus_betulus atcctgttttcccaaaacaaataaaac… 61 27.9 13.1
## 8 Carpinus_betulus atcctgttttcccaaaacaaataaaac… 61 27.9 9.14
## 9 Fraxinus_excelsior atcctgttttcccaaaacaaaggttca… 39 33.3 6.56
## 10 Fraxinus_excelsior atcctgttttcccaaaacaaaggttca… 39 33.3 22.4
## # ℹ 20 more rows
Target DNA concentrations of each sample
data_dd_fig <- dataddpcr %>%
left_join(summary_experiment %>% dplyr::select(sp, species_name, RankG))
## Joining with `by = join_by(sp)`
order_short <- summary_experiment %>% dplyr::select(sp, RankG) %>%
arrange(-RankG) %>% pull(sp)
highlight <- function(x, pat, color="black", family="") {
ifelse(grepl(pat, x), glue("<b style='font-family:{family}; color:{color}'>{x}</b>"), x)
}
ggplot(data_dd_fig)+
geom_point(aes(factor(sp, levels = order_short, ordered = T),
Concentration_copies_ng/1000,
shape = as.factor(TotalDNA)))+
geom_point(data = summary_experiment,
aes(as.factor(sp), Molecules_ng/1000),
fill = "red", shape = 23, size = 3)+
# ggtitle("Target DNA molecules at given Total DNA concentration")+
labs(x = "Plant species",
y = "Thousands of molecules per ng of total DNA\n",
shape = TeX("Concentration (ng/$\\mu l$)"))+
scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))+
# theme_bw()+
theme(axis.text.x=element_markdown(),
legend.title.align=0.5,
legend.text.align = 0.5)
# ggsave(filename = "Figures/ddpcr_fig_clean.pdf",
# dpi = 600, units = "mm",
# width = 150, height = 100, device = "pdf")
# ggsave(filename = "Figures/ddpcr_fig_clean.png",
# dpi = 600, units = "mm",
# width = 150, height = 90)
Statistics from ddPCR assays:
Species Populus tremula and Rhododendron ferrugineum
data_dd_fig %>% filter(sp == "Ptr") %>%
pull(Concentration_copies_ng) %>% summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 236800 240000 251200 248096 255520 256960
data_dd_fig %>% filter(sp == "Ptr") %>%
pull(Concentration_copies_ng) %>% sd
## [1] 9171.504
data_dd_fig %>% filter(sp == "Rfe") %>%
pull(Concentration_copies_ng) %>% summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33152 33680 34912 37483 37176 50720
data_dd_fig %>% filter(sp == "Rfe") %>%
pull(Concentration_copies_ng) %>% sd
## [1] 6695.412
summary_experiment$Molecules_ng %>% summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 37483 73740 89653 113443 140480 248096
Pipeline from raw fastq data: see data_processing.Rmd file (we used the obitools softwares).
Open data, filter bad PCR replicates:
df_Sper01 <- read_csv("data/df_Sper01.csv")
## Rows: 74708 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Sample, id, sequence, Community, species_name
## dbl (8): obiclean_weight, reads, Spike_level, total_count_sample, count, seq...
## lgl (1): NegControl
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary_sample <- df_Sper01 %>%
group_by(Sample) %>%
summarise(total_count_sample = sum(reads),
NegControl = NegControl[1],
Spike_level = Spike_level[1],
Community = Community[1]) %>%
arrange(total_count_sample)
summary(summary_sample$total_count_sample)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 414 12247 31505 35305 54730 108166
summary(summary_sample %>%
filter(!NegControl) %>% pull(total_count_sample))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1068 13952 33712 37023 55612 108166
sd(summary_sample %>%
filter(!NegControl) %>% pull(total_count_sample))
## [1] 26763.09
reads_threshold <- 5000 #Replicates with less than 5000 reads are removed
ggplot(summary_sample %>%
filter(!NegControl))+
geom_point(aes(x = 1:nrow(summary_sample %>%
filter(!NegControl)),
y = total_count_sample,
shape = Community))+
labs(col = "Spike_level")+
scale_y_log10()+
geom_hline(yintercept = reads_threshold)
Observed and expected proportions in final composition.
Community M_U
df_targetU <- df_Sper01 %>%
filter(Community == "U" &
species_name %in% barcodes$species_name &
total_count_sample > reads_threshold &
!NegControl) %>%
mutate(prop = obiclean_weight/total_count_sample) %>%
left_join(summary_experiment %>% dplyr::select(species_name, sp, propU)) %>%
rename(prop_expected = propU)
## Joining with `by = join_by(species_name)`
Community M_T
df_targetT <- df_Sper01 %>%
filter(Community == "T" &
species_name %in% barcodes$species_name &
total_count_sample > reads_threshold &
!NegControl) %>%
mutate(prop = obiclean_weight/total_count_sample) %>%
left_join(summary_experiment %>% dplyr::select(species_name, sp, propT)) %>%
rename(prop_expected = propT)
## Joining with `by = join_by(species_name)`
Community M_G
df_targetG <- df_Sper01 %>%
filter(Community == "G" &
species_name %in% barcodes$species_name &
total_count_sample > reads_threshold &
!NegControl) %>%
mutate(prop = obiclean_weight/total_count_sample) %>%
left_join(summary_experiment %>% dplyr::select(species_name, sp, propG)) %>%
rename(prop_expected = propG)
## Joining with `by = join_by(species_name)`
colp <- c("Expected" = "gold",
"Expected\nwithout\nPCR bias" = "gold",
"Expected\nwithout\nPCR and\nconcentration\nbiases" = "gold")
#"Inferred\nwith model" = "darkolivegreen2")
gobsU1 <- ggplot(df_targetU)+
geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
prop))+
geom_point(data = summary_experiment,
aes(sp, 1/nrow(summary_experiment),
fill = "Expected\nwithout\nPCR bias"),
shape = 23,
size = 3)+
geom_hline(aes(yintercept = 1/nrow(summary_experiment),
col = "Expected\nwithout\nPCR and\nconcentration\nbiases"),
linetype = "dashed")+
ggtitle(TeX("Uniform community (${{M}_U}$)"))+
labs(y = "", x = "", fill = "", col = "")+
ylim(min(df_targetT$prop), max(df_targetT$prop)) +
theme(plot.title = element_text(hjust = 0.5),
axis.text.x=element_markdown(),
legend.title.align=0.5)+
scale_color_manual(values = colp)+
scale_fill_manual(values = colp)+
scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))
gobsT1 <- ggplot(df_targetT)+
geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
prop))+
geom_point(data = summary_experiment,
aes(sp, propT, fill = "Expected\nwithout\nPCR bias"),
shape = 23,
size = 3)+
geom_hline(aes(yintercept = 1/nrow(summary_experiment),
col = "Expected\nwithout\nPCR and\nconcentration\nbiases"),
linetype = "dashed")+
ggtitle(TeX("Uniform in Total DNA community (${{M}_T}$)"))+
labs(y = "", x = "", fill = "", col = "")+
theme(plot.title = element_text(hjust = 0.5),
axis.text.x=element_markdown(),
legend.title.align=0.5)+
scale_color_manual(values = colp)+
scale_fill_manual(values = colp)+
scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))
# gobsG1 <- ggplot(df_targetG)+
# geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
# prop))+
# labs(y = "", x = "")+
# ggtitle("Geometric community (G)")+
# theme(plot.title = element_text(hjust = 0.5))+
# geom_point(data = summary_experiment,
# aes(sp, propG), shape = 23,
# fill = "gold", size = 3)+
# scale_y_log10()
fig <- ggpubr::ggarrange(gobsU1, gobsT1, #gobsG1,
ncol = 1,
common.legend = T,
legend = "right")
annotate_figure(fig,
# top = text_grob("Final abundances in two mock communities with Sper01"),
bottom = text_grob("Species"),
left = text_grob("Observed reads proportions", rot = 90))
# ggsave("Figures/metabar_results.pdf", units = "mm",
# width = 150, height = 200, dpi = 600, device = "pdf")
Examples of proportions
Species Populus tremula
df_targetU %>% filter(sp == "Ptr") %>% pull(prop) %>% summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.03278 0.07092 0.08630 0.08618 0.10383 0.13607
df_targetU %>% filter(sp == "Ptr") %>% pull(prop) %>% sd
## [1] 0.02675534
df_targetU %>% group_by(sp) %>% summarise(s = sd(prop), m = mean(prop)) %>%arrange(m)
## # A tibble: 13 × 3
## sp s m
## <chr> <dbl> <dbl>
## 1 Gro 0.00462 0.0241
## 2 Spr 0.00849 0.0267
## 3 Lxy 0.00417 0.0550
## 4 Aal 0.0301 0.0579
## 5 Bme 0.00964 0.0608
## 6 Fex 0.00629 0.0626
## 7 Lco 0.00770 0.0796
## 8 Cbe 0.0122 0.0809
## 9 Aca 0.0207 0.0827
## 10 Ptr 0.0268 0.0862
## 11 Rfe 0.0154 0.101
## 12 Cbp 0.0129 0.106
## 13 Rca 0.00872 0.171
df_targetG %>% group_by(sp) %>% summarise(s = sd(prop), m = mean(prop)) %>%arrange(m)
## # A tibble: 13 × 3
## sp s m
## <chr> <dbl> <dbl>
## 1 Aal 0.0000299 0.0000509
## 2 Rfe 0.000123 0.000177
## 3 Gro 0.000141 0.000228
## 4 Cbe 0.000131 0.000312
## 5 Aca 0.000473 0.00166
## 6 Cbp 0.000485 0.00189
## 7 Spr 0.00206 0.00625
## 8 Fex 0.00150 0.00676
## 9 Lxy 0.00181 0.00962
## 10 Ptr 0.0228 0.0551
## 11 Lco 0.00849 0.148
## 12 Bme 0.0386 0.366
## 13 Rca 0.0136 0.400
This approach considers that the factor bias is independent to the whole community composition.
Correction factor computed from community M_U is:
a_s = prop_obs(s)/prop_init(s) (* constant value)
then prop_infer_mock(s) = prop_obs(s)/a_s(M_U)
where prop_infer_mock are the inferred proportions established by this ratio correction.
order_sp <- summary_experiment %>% dplyr::select(species_name, RankG) %>%
arrange(-RankG) %>% pull(species_name)
correcU <- df_targetU %>%
group_by(species_name) %>%
summarise(correc = median(prop)/prop_expected[1], sp = sp[1])
#M_U
df_targetU <- df_targetU %>% left_join(correcU)
## Joining with `by = join_by(species_name, sp)`
df_targetU$prop_infer_mock <- df_targetU$prop/df_targetU$correc
df_targetU <- df_targetU %>%
group_by(Sample) %>%
mutate(prop_infer_mock = prop_infer_mock/sum(prop_infer_mock,
na.rm = T))
df_targetU$species_name <- factor(df_targetU$species_name,
levels = order_sp,
ordered = T)
#M_T
df_targetT <- df_targetT %>% left_join(correcU)
## Joining with `by = join_by(species_name, sp)`
df_targetT$prop_infer_mock <- df_targetT$prop/df_targetT$correc
df_targetT <- df_targetT %>%
group_by(Sample) %>%
mutate(prop_infer_mock = prop_infer_mock/sum(prop_infer_mock,
na.rm = T))
df_targetT$species_name <- factor(df_targetT$species_name,
levels = order_sp,
ordered = T)
#M_G
df_targetG <- df_targetG %>% left_join(correcU)
## Joining with `by = join_by(species_name, sp)`
df_targetG$prop_infer_mock <- df_targetG$prop/df_targetG$correc
df_targetG <- df_targetG %>%
group_by(Sample) %>%
mutate(prop_infer_mock = prop_infer_mock/sum(prop_infer_mock,
na.rm = T))
df_targetG$species_name <- factor(df_targetG$species_name,
levels = order_sp,
ordered = T)
# ginferT1 <- ggplot(df_targetT)+
# geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
# prop_infer_mock))+
# geom_point(data = df_targetT %>% group_by(sp) %>%
# summarise(prop = median(prop)),
# aes(sp, prop, fill = "Observed\n(Median)"),
# shape = 21, size = 3)+
# theme(axis.text.x=element_blank())+
# labs(y = "", x = "",
# fill = "", col = "")+
# geom_point(data = summary_experiment,
# aes(sp, propT, fill = "Expected"), shape = 23,
# size = 3)+
# scale_fill_manual(values = colp)+
# ggtitle("Uniform in Total DNA community (T)")+
# theme(plot.title = element_text(hjust = 0.5))
#
# ginferG1 <- ggplot(df_targetG)+
# geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
# prop_infer_mock))+
# geom_point(data = df_targetG %>% group_by(sp) %>%
# summarise(prop = median(prop)),
# aes(sp, prop, fill = "Observed\n(Median)"),
# shape = 21, size = 3)+
# labs(y = "", x = "",
# fill = "", col = "")+
# geom_point(data = summary_experiment,
# aes(sp, propG, fill = "Expected"), shape = 23,
# size = 3)+
# scale_fill_manual(values = colp)+
# ggtitle("Geometric community (G)")+
# theme(plot.title = element_text(hjust = 0.5),
# legend.position = "none")+
# scale_y_log10()+
# scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))+
# theme(axis.text.x=element_markdown())
#
# fig <- egg::ggarrange(ginferT1, ginferG1,
# ncol = 1)
# annotate_figure(fig,
# top = text_grob("Corrected abundances in two mock communities with Sper01"),
# bottom = text_grob("Species"),
# left = text_grob("Inferred reads proportions", rot = 90))
# ggsave("Figures/metabar_results_corrected.pdf", units = "mm",
# width = 180, height = 150, dpi = 600, device = "pdf")
Merge the results
resU1 <- df_targetU %>% group_by(species_name) %>%
summarise(prop = median(prop), prop_expected = prop_expected[1],
prop_infer_mock = median(prop_infer_mock))
resT1 <- df_targetT %>% group_by(species_name) %>%
summarise(prop = median(prop), prop_expected = prop_expected[1],
prop_infer_mock = median(prop_infer_mock))
resG1 <- df_targetG %>% group_by(species_name) %>%
summarise(sp = sp[1], prop = median(prop), prop_expected = prop_expected[1],
prop_infer_mock = median(prop_infer_mock))
Export data to Julia to infer efficiencies with flimo
readsU <- df_targetU %>%
dplyr::select(Sample, obiclean_weight, species_name) %>%
pivot_wider(names_from = species_name,
values_from = obiclean_weight) %>% ungroup %>% dplyr::select(-Sample)
qty_initU <- left_join(data.frame(species_name = colnames(readsU)),
summary_experiment %>%
dplyr::select(species_name, molecules_U)) %>%
pull(molecules_U)
## Joining with `by = join_by(species_name)`
# write_csv(readsU, "data/reads_U.csv")
# write_csv(data.frame(qty_initU), "data/qty_initU.csv")
readsT <- df_targetT %>%
dplyr::select(Sample, obiclean_weight, species_name) %>%
pivot_wider(names_from = species_name,
values_from = obiclean_weight) %>% ungroup %>% dplyr::select(-Sample)
qty_initT <- left_join(data.frame(species_name = colnames(readsU)),
summary_experiment %>%
dplyr::select(species_name, molecules_T)) %>%
pull(molecules_T)
## Joining with `by = join_by(species_name)`
# write_csv(readsT, "data/reads_T.csv")
# write_csv(data.frame(qty_initT), "data/qty_initT.csv")
readsG <- df_targetG %>%
dplyr::select(Sample, obiclean_weight, species_name) %>%
pivot_wider(names_from = species_name,
values_from = obiclean_weight,
values_fill = 0) %>% ungroup %>% dplyr::select(-Sample)
qty_initG <- left_join(data.frame(species_name = colnames(readsU)),
summary_experiment %>%
dplyr::select(species_name, molecules_G)) %>%
pull(molecules_G)
## Joining with `by = join_by(species_name)`
# write_csv(readsG, "data/reads_G.csv")
# write_csv(data.frame(qty_initG), "data/qty_initG.csv")
Proportions inferred in Julia using PCR efficiencies
Run metabar_bias_functions.jl (functions), inference_pcr_efficiencies_ps.jl (inference of efficiencies from M_U and then proportions in M_T and M_G) and inference_taqman.jl (processing the taqman analysis)
Inference with the flimo method - reads in M_U used to infer efficiencies Lambda_s and K - then efficiencies used to infer proportions in M_T and M_G
#Inferred efficiencies
efficiencies_infer <- t(read_csv("data/efficiencies_U.csv"))
## Rows: 13 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): Column1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
colnames(efficiencies_infer) <- colnames(readsU)
summary_experiment <- summary_experiment %>%
left_join(efficiencies_infer %>%
as_tibble() %>%
pivot_longer(1:13, names_to = "species_name",
values_to = "efficiencies"))
## Joining with `by = join_by(species_name)`
Loading inferred proportions computed in Julia
inferT <- read.csv("data/prop_inferT.csv")
colnames(inferT) <- colnames(readsT)
resT1 <- resT1 %>%
left_join(inferT %>% pivot_longer(everything(),
names_to = "species_name",
values_to = "prop_infer_Lambda")) %>%
arrange(-dplyr::row_number())
## Joining with `by = join_by(species_name)`
inferG <- read.csv("data/prop_inferG.csv")
colnames(inferG) <- colnames(readsG)
resG1 <- resG1 %>%
left_join(inferG %>% pivot_longer(everything(),
names_to = "species_name",
values_to = "prop_infer_Lambda")) %>%
arrange(-dplyr::row_number())
## Joining with `by = join_by(species_name)`
In this section, efficiencies of 3 species are estimated from a Taqman qPCR assay.
Open data (prepared in data_processing.Rmd)
res_taqman <- read.csv("data/res_taqman.csv")
kinetics <- read.csv("data/taqman_kinetics.csv")
Plot amplification curves and Cq = f(concentration)
ggplot(kinetics)+
geom_line(aes(Cycle, RFU, group = Well, col = probe))
ggplot(res_taqman)+
geom_point(aes(copies_ul, Cq, group = Well, col = probe))+
scale_x_log10()+
labs(y="Ct")+
ggtitle("Ct = f(log(Concentration))")
Vmix_taq <- 25 #ul
res_taqman$slope <- NA
res_taqman$intercept <- NA
for (s in unique(res_taqman$probe)){
reg <- lm(Cq ~ log10(copies),
res_taqman %>%
filter(probe == s) %>%
group_by(Dilu) %>%
summarise(Cq = mean(Cq),
copies = Vmix_taq*copies_ul[1]))
res_taqman[res_taqman$probe == s, "slope"] <- reg$coefficients[2]
res_taqman[
res_taqman$probe == s, "intercept"] <- reg$coefficients[1]
}
res_taqman$rate <- 10^(-1/res_taqman$slope)-1
res_taqman$Mct <- 10^(-res_taqman$intercept/res_taqman$slope)
res_taqman %>% group_by(probe) %>%
summarise(rate = rate[1],
Mct = Mct[1]/1e11)
## # A tibble: 4 × 3
## probe rate Mct
## <chr> <dbl> <dbl>
## 1 CbeA 0.985 1.76
## 2 CbeB 0.979 2.31
## 3 Cbp 0.968 1.08
## 4 Fex 0.930 1.26
Try to infer with only 2 out of the 3 replicates : too important variability
flam <- function(p){
10^(-1/p)-1
}
res_lm_test <- NULL
for (s in unique(res_taqman$probe)){
for (try in 1:100){
reg <- lm(Cq ~ log10(copies),
res_taqman %>%
filter(probe == s & !Neg) %>%
group_by(Dilu) %>%
summarise(Cq = mean(Cq[sample(1:n(), 2)]),
copies = Vmix_taq*copies_ul[1]))
res_lm_test <- rbind(res_lm_test,
data.frame(probe = s,
slope = reg$coefficients[2]))
}
}
res_lm_test %>% group_by(probe) %>% summarise(min = min(slope),
max = max(slope)) %>%
mutate(lmin = flam(min), lmax = flam(max), gap = lmax - lmin)
## # A tibble: 4 × 6
## probe min max lmin lmax gap
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CbeA -3.39 -3.31 0.970 1.01 0.0367
## 2 CbeB -3.40 -3.34 0.970 0.993 0.0222
## 3 Cbp -3.45 -3.36 0.950 0.986 0.0356
<<<<<<< HEAD
## 4 Fex -3.57 -3.45 0.905 0.949 0.0444
=======
## 4 Fex -3.57 -3.45 0.905 0.950 0.0447
>>>>>>> eb7f3054bbf0321141b9b1a66ddeff779bd19d4a
RFU Threshold for each well
res_taqman$RFU_Cq <- NA
for (i in 1:nrow(res_taqman)){
aux <- kinetics %>% filter(Dilu == res_taqman[i,]$Dilu &
Repli == res_taqman[i,]$Repli &
probe == res_taqman[i,]$probe &
abs(Cycle - res_taqman[i,]$Cq)<1)
res_taqman[i,"RFU_Cq"] <- (aux[2,]$RFU - aux[1,]$RFU)*
(res_taqman[i,]$Cq-aux[1,]$Cycle)+aux[1,]$RFU
}
res_taqman <- res_taqman %>%
mutate(K = Mct/(RFU_Cq/End.RFU))
Mct_glob <- res_taqman %>%
group_by(sp) %>%
summarise(Mct = mean(Mct)) %>% pull(Mct) %>% mean
K <- res_taqman %>%
group_by(sp) %>%
summarise(K = mean(K)) %>% pull(K) %>% sum
efficiencies <- res_taqman %>%
mutate(Lambda = (Mct_glob/(Vmix_taq*copies_ul))^(1/Cq)-1)
ggplot(efficiencies)+
geom_point(aes(probe, Lambda, col = factor(Dilu)))
efficiencies %>% group_by(probe) %>% summarise(Lambda = mean(Lambda))
## # A tibble: 4 × 2
## probe Lambda
## <chr> <dbl>
## 1 CbeA 0.972
## 2 CbeB 0.947
## 3 Cbp 0.989
## 4 Fex 0.940
efficiencies %>% group_by(sp) %>% summarise(Lambda = mean(Lambda))
## # A tibble: 3 × 2
## sp Lambda
## <chr> <dbl>
## 1 Cbe 0.960
## 2 Cbp 0.989
## 3 Fex 0.940
#lam = Lambdas = efficiencies:
lam <- efficiencies %>% group_by(sp) %>%
summarise(efficiency = mean(Lambda),
sdeff = sd(Lambda))
res_taqman <- res_taqman %>% left_join(lam)
## Joining with `by = join_by(sp)`
To compare the 2 probes for Carpinus betulus
x <- res_taqman %>% filter(sp == "Cbe" & Dilu < 6) %>%
group_by(probe, Dilu) %>%
summarise(end.rfu = mean(End.RFU), Cq = mean(Cq))
## `summarise()` has grouped output by 'probe'. You can override using the
## `.groups` argument.
correcCbe <- x %>% group_by(Dilu, probe) %>%
summarise(end.rfu = mean(end.rfu)) %>%
pivot_wider(names_from = probe, values_from = end.rfu) %>%
mutate(correc = CbeA/CbeB)
## `summarise()` has grouped output by 'Dilu'. You can override using the
## `.groups` argument.
y <- kinetics %>% filter(sp == "Cbe" & !Neg) %>% left_join(correcCbe) %>%
mutate(RFUbis = ifelse(probe == "CbeB", RFU*correc, RFU))
## Joining with `by = join_by(Dilu)`
ggplot(y)+
geom_line(aes(Cycle, RFU, col = probe, group = Well,
linetype = factor(Dilu)))
ggplot(y)+
geom_line(aes(Cycle, RFUbis, col = probe, group = Well,
linetype = factor(Dilu)))
z <- res_taqman %>% filter(sp == "Cbe" & !Neg)
z$RFU_Cq <- NA
for (i in 1:nrow(z)){
a <- kinetics %>% filter(Dilu == z[i,]$Dilu &
Repli == z[i,]$Repli &
probe == z[i,]$probe &
abs(Cycle - z[i,]$Cq)<1)
z[i,"RFU_Cq"] <- (a[2,]$RFU - a[1,]$RFU)*
(z[i,]$Cq-a[1,]$Cycle)+a[1,]$RFU
}
seuil_Cq <- mean(z$RFU_Cq)
fCq <- function(ampli){
c2 <- which(ampli >= seuil_Cq)[1]
c1 <- c2-1
Cq <- (seuil_Cq - ampli[c1]+(ampli[c2] - ampli[c1])*c1)/
(ampli[c2] - ampli[c1])
Cq
}
res <- NULL
for (p in unique(y$Well)){
res <- rbind(res,
data.frame(Well = p,
probe = y[y$Well == p,]$probe[1],
Cq = fCq(y[y$Well == p,]$RFU),
Cqbis = fCq(y[y$Well == p,]$RFUbis)))
}
ggplot(res_taqman %>% filter(!Neg & sp == "Cbe"))+
geom_point(aes(Dilu, Cq, col = probe))
Comparing the 3 species
qmet <- df_targetU %>%
filter(species_name %in% c("Capsella_bursa-pastoris",
"Carpinus_betulus",
"Fraxinus_excelsior"))
ggplot(qmet)+
geom_boxplot(aes(species_name,
prop))+
theme(axis.text.x = element_text(angle = 90))
qmet %>% group_by(species_name) %>%
summarise(prop = mean(prop)) %>% mutate(prop = prop/sum(prop))
## # A tibble: 3 × 2
## species_name prop
## <ord> <dbl>
## 1 Carpinus_betulus 0.325
## 2 Capsella_bursa-pastoris 0.424
## 3 Fraxinus_excelsior 0.251
Summarising previous information
resU1 %>% dplyr::select(species_name, prop_expected, prop)
## # A tibble: 13 × 3
## species_name prop_expected prop
## <ord> <dbl> <dbl>
## 1 Rhododendron_ferrugineum 0.0769 0.105
## 2 Abies_alba 0.0769 0.0531
## 3 Carpinus_betulus 0.0769 0.0850
## 4 Geranium_robertianum 0.0769 0.0256
## 5 Capsella_bursa-pastoris 0.0769 0.108
## 6 Acer_campestre 0.0769 0.0803
## 7 Fraxinus_excelsior 0.0769 0.0613
## 8 Lonicera_xylosteum 0.0769 0.0550
## 9 Salvia_pratensis 0.0769 0.0261
## 10 Populus_tremula 0.0769 0.0863
## 11 Lotus_corniculatus 0.0769 0.0785
## 12 Rosa_canina 0.0769 0.172
## 13 Briza_media 0.0769 0.0578
#Efficiencies by Taqman
lam
## # A tibble: 3 × 3
## sp efficiency sdeff
## <chr> <dbl> <dbl>
## 1 Cbe 0.960 0.0141
## 2 Cbp 0.989 0.00501
## 3 Fex 0.940 0.0108
#Normalized efficiencies
efficiencies %>% group_by(probe) %>%
summarise(efficiency = mean(Lambda),
sdeff = sd(Lambda)) %>%
mutate(efficiency = efficiency/lam$efficiency[3]*0.905,
sdeff = sdeff/lam$efficiency[3]*0.905)
## # A tibble: 4 × 3
## probe efficiency sdeff
## <chr> <dbl> <dbl>
## 1 CbeA 0.936 0.00540
## 2 CbeB 0.912 0.00588
## 3 Cbp 0.953 0.00482
## 4 Fex 0.905 0.0104
summary_experiment %>% arrange(RankG) %>%
dplyr::select(species_name, efficiencies)
## # A tibble: 13 × 2
## species_name efficiencies
## <chr> <dbl>
## 1 Briza_media 0.902
## 2 Rosa_canina 1
## 3 Lotus_corniculatus 0.927
## 4 Populus_tremula 0.935
## 5 Salvia_pratensis 0.827
## 6 Lonicera_xylosteum 0.892
## 7 Fraxinus_excelsior 0.905
## 8 Acer_campestre 0.931
## 9 Capsella_bursa-pastoris 0.954
## 10 Geranium_robertianum 0.818
## 11 Carpinus_betulus 0.929
## 12 Abies_alba 0.897
## 13 Rhododendron_ferrugineum 0.950
Inferred proportions in M_T and M_G
Inferred with the ratio method or the new approach in this paper
resT1
## # A tibble: 13 × 5
## species_name prop prop_expected prop_infer_mock prop_infer_Lambda
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Briza_media 0.0620 0.0851 0.0842 0.0829
## 2 Rosa_canina 0.262 0.144 0.123 0.125
## 3 Lotus_corniculatus 0.114 0.0953 0.115 0.116
## 4 Populus_tremula 0.210 0.168 0.198 0.203
## 5 Salvia_pratensis 0.0385 0.104 0.118 0.114
## 6 Lonicera_xylosteum 0.0269 0.0500 0.0385 0.0387
## 7 Fraxinus_excelsior 0.0429 0.0590 0.0568 0.0563
## 8 Acer_campestre 0.0793 0.0913 0.0781 0.0818
## 9 Capsella_bursa-pastor… 0.0322 0.0261 0.0245 0.0248
## 10 Geranium_robertianum 0.0209 0.0558 0.0653 0.0704
## 11 Carpinus_betulus 0.0276 0.0357 0.0258 0.0269
## 12 Abies_alba 0.0145 0.0608 0.0227 0.0212
## 13 Rhododendron_ferrugin… 0.0506 0.0254 0.0371 0.0389
resG1
## # A tibble: 13 × 6
## species_name sp prop prop_expected prop_infer_mock prop_infer_Lambda
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Briza_media Bme 3.64e-1 0.500 0.536 0.530
## 2 Rosa_canina Rca 4.01e-1 0.250 0.200 0.202
## 3 Lotus_cornicul… Lco 1.48e-1 0.125 0.160 0.162
## 4 Populus_tremula Ptr 5.20e-2 0.0625 0.0516 0.0557
## 5 Salvia_pratens… Spr 6.03e-3 0.0313 0.0199 0.0210
## 6 Lonicera_xylos… Lxy 9.38e-3 0.0156 0.0147 0.0155
## 7 Fraxinus_excel… Fex 6.84e-3 0.00781 0.00955 0.00951
## 8 Acer_campestre Aca 1.60e-3 0.00391 0.00170 0.00176
## 9 Capsella_bursa… Cbp 1.92e-3 0.00195 0.00151 0.00154
## 10 Geranium_rober… Gro 1.95e-4 0.000977 0.000643 0.000849
## 11 Carpinus_betul… Cbe 3.03e-4 0.000488 0.000306 0.000337
## 12 Abies_alba Aal 4.50e-5 0.000244 0.0000718 0.0000470
## 13 Rhododendron_f… Rfe 1.45e-4 0.000122 0.000119 0.000141
Computation of neff for M_U (number of effective exponential cycles) with the truncated exponential model
The K value used is the one selected during the inference in flimo.
summary_experiment <- summary_experiment %>%
left_join(resU1 %>% dplyr::select(species_name, prop) %>%
rename(prop_obsU = prop)) %>%
left_join(resT1 %>% dplyr::select(species_name, prop) %>%
rename(prop_obsT = prop)) %>%
left_join(resG1 %>% dplyr::select(species_name, prop) %>%
rename(prop_obsG = prop))
## Joining with `by = join_by(species_name)`
## Joining with `by = join_by(species_name)`
## Joining with `by = join_by(species_name)`
K <- 1.32e11
#mg stands for Geometric Mean
mg_lam_s_p1 <- exp(mean(log(summary_experiment$efficiencies+1))) #Mean of Lambda_s + 1
#M_U
mg_psU <- exp(mean(log(summary_experiment$prop_obsU))) #Mean of relative abundances ps
mg_m0sU <- exp(mean(log(VDNA_metab*summary_experiment$molecules_U))) #molecules/ug ; Mean of initial numbers of molecules m0s
neffU <- (log(mg_psU)+log(K)-log(mg_m0sU))/log(mg_lam_s_p1)
#M_T
mg_psT <- exp(mean(log(summary_experiment$prop_obsT)))
mg_m0sT <- exp(mean(log(VDNA_metab*summary_experiment$molecules_T))) #molecules/ug
neffT <- (log(mg_psT)+log(K)-log(mg_m0sT))/log(mg_lam_s_p1)
#M_G
mg_psG <- exp(mean(log(summary_experiment$prop_obsG)))
mg_m0sG <- exp(mean(log(VDNA_metab*summary_experiment$molecules_G))) #molecules/ug
neffG <- (log(mg_psG)+log(K)-log(mg_m0sG))/log(mg_lam_s_p1)
Cycles <- 1:40
res_exp <- NULL
for (cyc in Cycles){
m0s <- summary_experiment$prop_obsU/
(1+summary_experiment$efficiencies)^cyc
psU <- m0s/sum(m0s)
m0s <- summary_experiment$prop_obsT/
(1+summary_experiment$efficiencies)^cyc
psT <- m0s/sum(m0s)
m0s <- summary_experiment$prop_obsG/
(1+summary_experiment$efficiencies)^cyc
psG <- m0s/sum(m0s)
res_exp <- rbind(res_exp, data.frame(sp = summary_experiment$sp,
cycle = cyc,
psU = psU, psT = psT, psG = psG))
}
# res_exp %>% View
ggplot(res_exp)+
geom_line(aes(cycle, psU, col = sp))+
geom_hline(yintercept = 1/13)+
geom_vline(xintercept = neffU)
ggplot(res_exp)+
geom_line(aes(cycle, psT, col = sp))
ggplot(res_exp)+
geom_line(aes(cycle, psG, col = sp))
RMSE (root mean squared error) to compare observed/corrected vs expected proportions Either absolute or relative error
AbsErr <- function(p_observed, p_expected){
sqrt(sum((p_observed-p_expected)^2)/length(p_observed))
}
RelErr <- function(p_observed, p_expected){
sqrt(sum(((p_observed-p_expected)/p_expected)^2)/length(p_observed))
}
#U
summary(resU1$prop)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.02565 0.05499 0.07852 0.07646 0.08630 0.17207
AbsErr(resU1$prop, resU1$prop_expected)
## [1] 0.03702942
RelErr(resU1$prop, resU1$prop_expected)
## [1] 0.4813825
#T
summary(resT1$prop)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01454 0.02758 0.04289 0.07551 0.07932 0.26184
AbsErr(resT1$prop, resT1$prop_expected)
## [1] 0.04465347
AbsErr(resT1$prop_infer_mock, resT1$prop_expected)
## [1] 0.01741129
AbsErr(resT1$prop_infer_Lambda, resT1$prop_expected)
## [1] 0.01837216
RelErr(resT1$prop, resT1$prop_expected)
## [1] 0.5269626
RelErr(resT1$prop_infer_mock, resT1$prop_expected)
## [1] 0.2631424
RelErr(resT1$prop_infer_Lambda, resT1$prop_expected)
## [1] 0.2798953
#RMSE of T if it is considered as a uniform community
AbsErr(resT1$prop, resU1$prop_expected)
## [1] 0.07382099
RelErr(resT1$prop, resU1$prop_expected)
## [1] 0.9596729
#G
summary(resG1$prop)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000450 0.0003027 0.0060311 0.0762360 0.0519728 0.4009659
AbsErr(resG1$prop, resG1$prop_expected)
## [1] 0.05724279
AbsErr(resG1$prop_infer_mock, resG1$prop_expected)
## [1] 0.02026261
AbsErr(resG1$prop_infer_Lambda, resG1$prop_expected)
## [1] 0.01914612
RelErr(resG1$prop, resG1$prop_expected)
## [1] 0.4930014
RelErr(resG1$prop_infer_mock, resG1$prop_expected)
## [1] 0.3358845
RelErr(resG1$prop_infer_Lambda, resG1$prop_expected)
## [1] 0.3322317
Inference at neffU : if the amplification is exponential and is stopped after neffU cycles, the estimated proportions are psU, psT and psG (below). Associated RMSE.
m0s <- summary_experiment$prop_obsU/
(1+summary_experiment$efficiencies)^neffU
psU <- m0s/sum(m0s)
m0s <- summary_experiment$prop_obsT/
(1+summary_experiment$efficiencies)^neffU
psT <- m0s/sum(m0s)
m0s <- summary_experiment$prop_obsG/
(1+summary_experiment$efficiencies)^neffU
psG <- m0s/sum(m0s)
AbsErr(psU, summary_experiment$propU)
## [1] 0.003087341
RelErr(psU, summary_experiment$propU)
## [1] 0.04013544
AbsErr(psT, summary_experiment$propT)
## [1] 0.01808665
RelErr(psT, summary_experiment$propT)
## [1] 0.2917926
AbsErr(psG, summary_experiment$propG)
## [1] 0.01740845
RelErr(psG, summary_experiment$propG)
## [1] 0.3277107
###Associated biodiversity measures: Hill numbers
Computing the Hill numbers to evaluate the influence of correction on ecological conclusions.
#Hill
D <- function(ps, q){
ps <- ps/sum(ps)
ps <- ps[ps>0]
ifelse(q == 1, exp(-sum(ps*log(ps))), sum(ps^q)^(1/(1-q)))
}
#Community T
D(resT1$prop_expected, 1)
## [1] 11.26119
D(resT1$prop, 1)
## [1] 8.868797
D(resT1$prop_infer_mock, 1)
## [1] 10.64034
D(resT1$prop_infer_Lambda, 1)
## [1] 10.63284
D(resT1$prop_expected, 2)
## [1] 10.02984
D(resT1$prop, 2)
## [1] 6.647851
D(resT1$prop_infer_mock, 2)
## [1] 9.125834
D(resT1$prop_infer_Lambda, 2)
## [1] 9.105885
#Community G
D(resG1$prop_expected, 1)
## [1] 3.995114
D(resG1$prop, 1)
## [1] 3.70669
D(resG1$prop_infer_mock, 1)
## [1] 3.733344
D(resG1$prop_infer_Lambda, 1)
## [1] 3.806799
D(resG1$prop_expected, 2)
## [1] 2.999268
D(resG1$prop, 2)
## [1] 3.089144
D(resG1$prop_infer_mock, 2)
## [1] 2.784234
D(resG1$prop_infer_Lambda, 2)
## [1] 2.845309
Plot proportions for G
ginferG1 <- ggplot(df_targetG)+
geom_boxplot(aes(factor(sp, levels = order_short,
ordered = T),
prop))+
geom_point(data = summary_experiment,
aes(sp, propG, fill = "Expected"), shape = 23,
size = 2)+
geom_point(data = resG1, aes(sp, prop_infer_Lambda, fill = "Inferred\nwith model"),
shape = 23, size = 2)+
labs(y = "Proportions", x = "Species",
fill = "", col = "")+
scale_fill_manual(values = colp)+
ggtitle("Abundances in the Geometric community (G)")+
theme(plot.title = element_text(hjust = 0.5))+
scale_y_log10()+
scale_x_discrete(labels= function(x) highlight(x, "Cb|Fe"))+
theme(axis.text.x=element_markdown())
ginferG1
# ggsave("Figures/metabar_results_corrected_G.png", units = "mm",
# width = 150, height = 90, dpi = 600)
Simulated amplification to quantify the effect of efficiency differences.
pcr_sim : Function to simulate PCR with multiple species
pcr_sim <- function(m0=1,
lambda=rep(1, length(m0)),
K=1e11,
cycles = 40,
satu = c("log", "meca"), #meca not introduced in paper
c = 0.9, #for meca
e = 1.1, #for meca
delta = 1) { #for initial overdispersion
param <- data.frame(m0 = m0,
lambda = lambda)
nspecies <- nrow(param)
#Number of molecules at each cycle for each species
kinetics <- matrix(0, nrow = cycles+1, ncol = nspecies)
if (delta <= 1){
for (m in 1:nspecies){
kinetics[1,m] <- rpois(1, m0[m])
}
} else {
for (m in 1:nspecies){
kinetics[1,m] <- rnbinom(1, m0[m]/delta, 1/delta)
}
}
for (cyc in 2:(1+cycles)) {
charge <- sum(kinetics[cyc-1,]-kinetics[1,])/K
for (m in 1:nspecies) {
lam <- lambda[m]
mc <- kinetics[cyc-1,m] #Molecules of s at previous cycle
if(satu[1] == "meca"){ #This model is not introduced in the paper
gamma <- function(x){
2/(1+sqrt(1-4*c^2*x*(1-x)))
}
lam <- lam*c*gamma(charge)*(1-charge)*(e-charge)
}
else{ #logistic
lam <- max(0, lam*(1-charge))
}
#Number of created molecules
nb <- rbinom(1, mc, lam)
kinetics[cyc,m] <- kinetics[cyc-1,m]+nb
}
}
return(kinetics)
}
nspecies <- 2
ncycles <- 40
M0 = rep(2.5e5/nspecies, nspecies) #Total number of molecules: 2.5e5
#Tested values
lam1 <- 1 #Efficiency of species 1
Lam2 <- seq(0.8, 1, by = 0.001) #Different efficiencies of species 2
res <- NULL
for (lam2 in Lam2){
p <- pcr_sim(m0=M0, lambda=c(lam1, lam2), K=K,
cycles = ncycles)
res <- rbind(res, p[ncycles+1,])
}
#Proportions of the 2 species
prop2 <- df_targetU %>% group_by(sp) %>%
summarise(propU = median(prop)) %>% ungroup() %>%
mutate(propU = propU/(propU+propU[11])) %>% #11 = Rca
left_join(summary_experiment %>% dplyr::select(sp, efficiencies)) %>%
mutate(deff = 1-efficiencies) %>% filter(sp != "Rc")
## Joining with `by = join_by(sp)`
Figure:
#Labels positions
prop2$labelx <- prop2$deff - 0.0021
prop2[prop2$sp == "Rfe","labelx"] <- prop2[prop2$sp == "Rfe","labelx"]+
0.0042
prop2[prop2$sp == "Lco","labelx"] <- prop2[prop2$sp == "Lco","labelx"]+
0.0042
prop2[prop2$sp == "Lxy","labelx"] <- prop2[prop2$sp == "Lxy","labelx"]+
0.0042
prop2[prop2$sp == "Aal","labelx"] <- prop2[prop2$sp == "Aal","labelx"]+
0.0038
prop2[prop2$sp == "Bme","labelx"] <- prop2[prop2$sp == "Bme","labelx"]+
0.0037
prop2[prop2$sp == "Cbe","labelx"] <- prop2[prop2$sp == "Cbe","labelx"]+
0.0021
ggplot()+
geom_vline(data = prop2 %>% filter(sp != "Rca"), aes(xintercept = deff),
col = "grey80")+
geom_line(aes(1-Lam2/lam1, res[,1]/(rowSums(res)), col = "Rosa canina"))+
geom_line(aes(1-Lam2/lam1, res[,2]/(rowSums(res)), col = "other"))+
geom_hline(yintercept = 0.5, linetype = "dashed")+
# ggtitle("Simulated mock communities of two species with equal starting quantities")+
# theme(plot.title = element_text(hjust = 0.5))+
geom_point(data = prop2 %>% filter(sp != "Rca"),
aes(deff, propU, col = "other"))+
geom_point(data = prop2 %>% filter(sp != "Rca"),
aes(deff, 1-propU, col = "Rosa canina"))+
geom_text(aes(0.135, 0.73, label = "Rosa canina", col = "Rosa canina"),
size = 4)+
geom_text(aes(0.135, 0.27, label = "Second species", col = "other"),
size = 4)+
geom_text(data = prop2 %>%
filter(sp != "Rca") %>% arrange(efficiencies),
aes(labelx, 1, label = sp),
col = "grey50",
size = 3, angle = 90)+
geom_text(data = prop2 %>% filter(sp %in% c("Cbe", "Cbp", "Fex")) %>%
arrange(efficiencies),
aes(labelx, 1, label = sp),
col = "grey25",
size = 3, angle = 90)+
labs(x = "Relative decrease of PCR efficiency compared to Rosa canina",
y = "Final relative abundances", col = "Species")+
coord_cartesian(xlim = c(0.006, 0.185), ylim = c(0,1))+
theme(legend.position = "none")+
scale_colour_brewer(palette = "Set1")
<<<<<<< HEAD
# geom_text_repel(
# data = prop2 %>% filter(sp != "Rca"),
# aes(deff, propU, label = sp),
# size = 4,
# min.segment.length = unit(0, 'lines'))
# rep(c(1,0.95), 6)
# ggsave(filename = "Figures/simu_efficiencies.pdf", units = "mm",
# height = 100, width = 150, device = "pdf", dpi = 600)
Mass of one genome for the 13 species.
summary_experiment$Cvalue <- c(0.46, 0.45, 1.03, 0.84, 0.70, 17.29,
1.02, 6.35, 1.40, 0.40, 1.76, 0.74, 0.87)
summary_experiment <- summary_experiment %>%
mutate(copies_genome = Molecules_ng*1e9*Cvalue*1e-12)
summary_experiment
## # A tibble: 13 × 21
## Sample species_name sp Molecules_ng CDNA_sample Volume_equi cDNA_U
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 4 Salvia_pratensis Spr 152880 24.4 1.62 0.0624
## 2 6 Populus_tremula Ptr 248096 31.4 1 0.0385
## 3 10 Carpinus_betulus Cbe 52667. 9.14 4.71 0.181
## 4 12 Fraxinus_excelsior Fex 87040 22.4 2.85 0.110
## 5 16 Lonicera_xylosteum Lxy 73740 45.8 3.36 0.129
## 6 18 Abies_alba Aal 89653. 3.58 2.77 0.106
## 7 20 Acer_campestre Aca 134683. 12.2 1.84 0.0708
## 8 22 Briza_media Bme 125573. 183 1.98 0.0760
## 9 24 Rosa_canina Rca 211640 50.8 1.17 0.0451
## 10 26 Capsella_bursa-past… Cbp 38533. 38.8 6.44 0.248
## 11 28 Geranium_robertianum Gro 82293. 15 3.01 0.116
## 12 30 Rhododendron_ferrug… Rfe 37483. 3.9 6.62 0.255
## 13 32 Lotus_corniculatus Lco 140480 65.2 1.77 0.0679
## # ℹ 14 more variables: molecules_U <dbl>, propU <dbl>, RankG <dbl>,
## # molecules_T <dbl>, propT <dbl>, cDNA_G <dbl>, molecules_G <dbl>,
## # propG <dbl>, efficiencies <dbl>, prop_obsU <dbl>, prop_obsT <dbl>,
## # prop_obsG <dbl>, Cvalue <dbl>, copies_genome <dbl>
summary_experiment %>% arrange(RankG)
## # A tibble: 13 × 21
## Sample species_name sp Molecules_ng CDNA_sample Volume_equi cDNA_U
## <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 22 Briza_media Bme 125573. 183 1.98 0.0760
## 2 24 Rosa_canina Rca 211640 50.8 1.17 0.0451
## 3 32 Lotus_corniculatus Lco 140480 65.2 1.77 0.0679
## 4 6 Populus_tremula Ptr 248096 31.4 1 0.0385
## 5 4 Salvia_pratensis Spr 152880 24.4 1.62 0.0624
## 6 16 Lonicera_xylosteum Lxy 73740 45.8 3.36 0.129
## 7 12 Fraxinus_excelsior Fex 87040 22.4 2.85 0.110
## 8 20 Acer_campestre Aca 134683. 12.2 1.84 0.0708
## 9 26 Capsella_bursa-past… Cbp 38533. 38.8 6.44 0.248
## 10 28 Geranium_robertianum Gro 82293. 15 3.01 0.116
## 11 10 Carpinus_betulus Cbe 52667. 9.14 4.71 0.181
## 12 18 Abies_alba Aal 89653. 3.58 2.77 0.106
## 13 30 Rhododendron_ferrug… Rfe 37483. 3.9 6.62 0.255
## # ℹ 14 more variables: molecules_U <dbl>, propU <dbl>, RankG <dbl>,
## # molecules_T <dbl>, propT <dbl>, cDNA_G <dbl>, molecules_G <dbl>,
## # propG <dbl>, efficiencies <dbl>, prop_obsU <dbl>, prop_obsT <dbl>,
## # prop_obsG <dbl>, Cvalue <dbl>, copies_genome <dbl>
100 simulated communities for each of the 3 M0tot values with 13 species
###Varying initial quantity, final abundances
The initial number of molecules varies in the M_U community:
#Different values of total number of molecules at the beginning
M0tot <- 2.5e5*c(1e-1, 1, 1e1)
nsim <- 100
compo <- NULL
for (sim in 1:nsim){
for (m0tot in M0tot){
#Composition of this community
MU_varM0 <- pcr_sim(m0=rep(m0tot/nrow(summary_experiment),
nrow(summary_experiment)),
lambda=summary_experiment$efficiencies,
K=K,
cycles = ncycles)
compo <- rbind(compo, data.frame(sim = sim,
M0tot = m0tot,
sp = summary_experiment$sp,
ps = MU_varM0[nrow(MU_varM0),]/
sum(MU_varM0[nrow(MU_varM0),])))
}
}
ggplot(compo %>% filter(sp %in% c("Rca", "Gro")))+
geom_boxplot(aes(factor(M0tot), ps, col = sp))+
geom_point(data = data.frame(),
aes(factor(M0tot[2]),
(summary_experiment %>%
filter(sp %in% c("Rca", "Gro")) %>% pull(prop_obsU))),
shape = 23, fill = "gold")+
scale_color_brewer(palette = "Set1")
<<<<<<< HEAD
#adjust positions for real proportions...
compo %>% filter(sp %in% c("Rca", "Gro")) %>% group_by(M0tot, sp) %>%
summarise(mps = mean(ps))
## `summarise()` has grouped output by 'M0tot'. You can override using the
## `.groups` argument.
## # A tibble: 6 × 3
## # Groups: M0tot [3]
## M0tot sp mps
## <dbl> <chr> <dbl>
## 1 25000 Gro 0.0195
<<<<<<< HEAD
## 2 25000 Rca 0.192
=======
## 2 25000 Rca 0.193
>>>>>>> eb7f3054bbf0321141b9b1a66ddeff779bd19d4a
## 3 250000 Gro 0.0243
## 4 250000 Rca 0.171
## 5 2500000 Gro 0.0300
## 6 2500000 Rca 0.152
Data are generated with the logistic model. Corrections are brought with our approach and with the ratio method (assuming exponential model)
compo_ref <- compo %>% filter(M0tot == 2.5e5) %>% group_by(sp) %>%
summarise(med_ps = median(ps)) %>%
mutate(correc = med_ps/(1/nrow(summary_experiment)))
#observed ps/expected ps
#with expected ps = 1/13
compo <- compo %>% left_join(summary_experiment[,c("sp", "species_name")]) %>%
left_join(compo_ref) %>%
mutate(ps_ratio = ps/correc)
## Joining with `by = join_by(sp)`
## Joining with `by = join_by(sp)`
compo <- compo %>% left_join(
compo %>% group_by(sim, M0tot) %>%
summarise(sum_ps_ratio = sum(ps_ratio)) ) %>%
mutate(ps_ratio = ps_ratio/sum_ps_ratio) %>%
dplyr::select(-sum_ps_ratio)
## `summarise()` has grouped output by 'sim'. You can override using the `.groups`
## argument.
## Joining with `by = join_by(sim, M0tot)`
Export to Julia (for inference)
for(i in 1:length(M0tot)){
m0tot <- M0tot[i]
mat <- compo %>% filter(M0tot == m0tot) %>%
dplyr::select(sim, species_name, ps) %>%
pivot_wider(names_from = species_name, values_from = ps) %>%
dplyr::select(colnames(readsU)) %>% as.matrix()
# write.csv(mat, paste0("data/sim100_m0tot_", i, ".csv"),row.names = F)
}
# df_targetU2 <- df_targetU %>% left_join(correcU)
#
# ggplot(compo)+
# geom_boxplot(data = df_targetU2, aes(sp, prop, col = "MU"),
# alpha = 0.2)+
# geom_boxplot(data = df_targetU2, aes(sp, prop/correc, col = "MU_correc"),
# alpha = 0.2)+
# geom_boxplot(aes(sp, ps, col = "sim"), alpha = 0.2)+
# geom_boxplot(aes(sp, ps/correc, col = "sim_correc"), alpha = 0.2)+
# facet_wrap(.~M0tot, ncol = 1)+
# geom_hline(yintercept = 1/13)
Inference results computed in Julia
res_commu1 <- read.csv("data/res_commu1.csv")
res_commu2 <- read.csv("data/res_commu2.csv")
res_commu3 <- read.csv("data/res_commu3.csv")
colnames(res_commu1) <- colnames(readsU)
colnames(res_commu2) <- colnames(readsU)
colnames(res_commu3) <- colnames(readsU)
res_commu1 <- res_commu1 %>% as_tibble() %>%
mutate(sim = 1:100, M0tot = 2.5e4) %>%
pivot_longer(1:13, names_to = "species_name", values_to = "ps_log") %>%
left_join(summary_experiment %>% dplyr::select(sp, species_name))
## Joining with `by = join_by(species_name)`
res_commu2 <- res_commu2 %>% as_tibble() %>%
mutate(sim = 1:100, M0tot = 2.5e5) %>%
pivot_longer(1:13, names_to = "species_name", values_to = "ps_log") %>%
left_join(summary_experiment %>% dplyr::select(sp, species_name))
## Joining with `by = join_by(species_name)`
res_commu3 <- res_commu3 %>% as_tibble() %>%
mutate(sim = 1:100, M0tot = 2.5e6) %>%
pivot_longer(1:13, names_to = "species_name", values_to = "ps_log") %>%
left_join(summary_experiment %>% dplyr::select(sp, species_name))
## Joining with `by = join_by(species_name)`
res_infer_log <- rbind(res_commu1, res_commu2, res_commu3)
Supplementary Figure 3
to compare the two approaches
ggplot(compo)+
geom_boxplot(aes(factor(sp, levels = order_short, ordered = T),
ps_ratio,
group = sp, col = "Ratio"), alpha = 0.1)+
geom_boxplot(data = res_infer_log, aes(sp, ps_log,
group = sp, col = "Logistic model"), alpha = 0.1)+
geom_point(data = summary_experiment,
aes(sp, 1/nrow(summary_experiment),
fill = "Expected\nwithout\nPCR bias"),
shape = 23,
size = 3)+
facet_wrap(M0tot~., ncol = 1)+
# scale_color_manual(values = colp)+
scale_fill_manual(values = colp)+
labs(x = "Species", y = "Corrected reads proportions",
fill = "", col = "Correction")+
scale_color_brewer(palette = "Set1")+
ggtitle(TeX("$M_0^{{total}}$")) +
theme(legend.title = element_text(hjust = 0.5),
plot.title = element_text(hjust = 0.5),
axis.text.x=element_markdown())+
scale_x_discrete(labels = function(x) highlight(x, "Cb|Fe"))
<<<<<<< HEAD
# ggsave(filename = "Figures/simu_exp_log.pdf",
# dpi = 600, units = "mm",
# width = 150, height = 180, device = "pdf")
###Data
param <- read.csv("data/res_infer_Lambda_K.csv")
colnames(param) <- c(colnames(readsU)[-1], "lK")
param$K <- 10^param$lK*1e13
median(param$K)/1e11
## [1] 1.325024
param <- param %>% pivot_longer(1:(nrow(summary_experiment)-1),
names_to = "species_name",
values_to = "efficiencies")
param <- param %>%
left_join(summary_experiment %>% dplyr::select(species_name, sp))
## Joining with `by = join_by(species_name)`
###Supplementary Fig 1: K = f(Lambda_s)
ggplot(param)+
geom_point(aes(K, efficiencies, col = sp))+
# geom_density(aes(K, y = ..scaled..))+
# scale_y_continuous(sec.axis = sec_axis(~./5-1, name="Density")) +
# geom_hline(aes(yintercept = 1, col = "Rca"))+
geom_segment(aes(x = min(param$K), xend = max(param$K), y = 1, yend = 1,
col = "Rca"))+
geom_vline(xintercept = median(param$K), linetype = "dashed")+
scale_x_log10()+
labs(y = TeX("Efficiencies $\\Lambda_s$"), col = "Species")+
theme(
legend.title.align=0.5,
legend.text.align = 0.5)
# ggsave(filename = "Figures/Lambda_fctK.pdf",
# dpi = 600, units = "mm",
# width = 150, height = 100, device = "pdf")
=======
ggsave(filename = "Figures/Lambda_fctK.pdf",
dpi = 600, units = "mm",
width = 150, height = 100, device = "pdf")
>>>>>>> eb7f3054bbf0321141b9b1a66ddeff779bd19d4a
###Supplementary Fig 2: hist(K)
ggplot(param)+
geom_histogram(aes(K, y = ..count../sum(..count..)), bins = 15)+
scale_x_log10()+
labs(y = "Frequency")
# ggsave(filename = "Figures/histK.pdf",
# dpi = 600, units = "mm",
# width = 150, height = 100, device = "pdf")
Molar mass of 1 dNTP: \(M=327g/mol\) in average
Molar concentration of the 4 dNTP in well: \(c_n=1 mM = 1.10^{-3}mol/l\)
Quantity of primers in well: \(n_p= 20pmol\)/PCR ?
Volume of well: \(V = 20 \mu l\)
Constant of Avogadro \(N_A = 6.02214076\times 10^{23} \: mol^{-1}\)
Average length of a sequence: \(L=50\) dNTP
Thus, \(K=\frac{min(c_n\times V, \: n_p)}{L}\times N_A\)
This is the maximum number of molecules that can be created in a well. Around 1e14.
c_n = 1e-3 #concentration of nucleotides, mol/l
Vwell = 20e-6 #volume of a well, l
N_A = 6.02214076e23 #Avogadro
L = barcodes$sequence %>% nchar %>% mean
#mean length of a sequence, number of nucleotides
n_p = 1e-8 #quantity of primers, mol
Ksup = min(c_n*Vwell, n_p)/L*N_A
Ksup
## [1] 1.182357e+14